data_dir = './downloads'
import os
os.listdir(data_dir)
import pandas as pd
hotel_raw_df = pd.read_csv('downloads/hotel_bookings.csv')
#Data Preparation and Cleaning
hotel_raw_df
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.00 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.00 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.00 | 0 | 1 | Check-Out | 2015-07-03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 119385 | City Hotel | 0 | 23 | 2017 | August | 35 | 30 | 2 | 5 | 2 | ... | No Deposit | 394.0 | NaN | 0 | Transient | 96.14 | 0 | 0 | Check-Out | 2017-09-06 |
| 119386 | City Hotel | 0 | 102 | 2017 | August | 35 | 31 | 2 | 5 | 3 | ... | No Deposit | 9.0 | NaN | 0 | Transient | 225.43 | 0 | 2 | Check-Out | 2017-09-07 |
| 119387 | City Hotel | 0 | 34 | 2017 | August | 35 | 31 | 2 | 5 | 2 | ... | No Deposit | 9.0 | NaN | 0 | Transient | 157.71 | 0 | 4 | Check-Out | 2017-09-07 |
| 119388 | City Hotel | 0 | 109 | 2017 | August | 35 | 31 | 2 | 5 | 2 | ... | No Deposit | 89.0 | NaN | 0 | Transient | 104.40 | 0 | 0 | Check-Out | 2017-09-07 |
| 119389 | City Hotel | 0 | 205 | 2017 | August | 35 | 29 | 2 | 7 | 2 | ... | No Deposit | 9.0 | NaN | 0 | Transient | 151.20 | 0 | 2 | Check-Out | 2017-09-07 |
119390 rows × 32 columns
hotel_raw_df.shape
(119390, 32)
hotel_raw_df.columns
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
'country', 'market_segment', 'distribution_channel',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
'company', 'days_in_waiting_list', 'customer_type', 'adr',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
hotel_raw_df.describe()
| is_canceled | lead_time | arrival_date_year | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | booking_changes | agent | company | days_in_waiting_list | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119386.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 103050.000000 | 6797.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 |
| mean | 0.370416 | 104.011416 | 2016.156554 | 27.165173 | 15.798241 | 0.927599 | 2.500302 | 1.856403 | 0.103890 | 0.007949 | 0.031912 | 0.087118 | 0.137097 | 0.221124 | 86.693382 | 189.266735 | 2.321149 | 101.831122 | 0.062518 | 0.571363 |
| std | 0.482918 | 106.863097 | 0.707476 | 13.605138 | 8.780829 | 0.998613 | 1.908286 | 0.579261 | 0.398561 | 0.097436 | 0.175767 | 0.844336 | 1.497437 | 0.652306 | 110.774548 | 131.655015 | 17.594721 | 50.535790 | 0.245291 | 0.792798 |
| min | 0.000000 | 0.000000 | 2015.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 6.000000 | 0.000000 | -6.380000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 18.000000 | 2016.000000 | 16.000000 | 8.000000 | 0.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 62.000000 | 0.000000 | 69.290000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 69.000000 | 2016.000000 | 28.000000 | 16.000000 | 1.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 179.000000 | 0.000000 | 94.575000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 160.000000 | 2017.000000 | 38.000000 | 23.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 229.000000 | 270.000000 | 0.000000 | 126.000000 | 0.000000 | 1.000000 |
| max | 1.000000 | 737.000000 | 2017.000000 | 53.000000 | 31.000000 | 19.000000 | 50.000000 | 55.000000 | 10.000000 | 10.000000 | 1.000000 | 26.000000 | 72.000000 | 21.000000 | 535.000000 | 543.000000 | 391.000000 | 5400.000000 | 8.000000 | 5.000000 |
hotel_raw_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 hotel 119390 non-null object 1 is_canceled 119390 non-null int64 2 lead_time 119390 non-null int64 3 arrival_date_year 119390 non-null int64 4 arrival_date_month 119390 non-null object 5 arrival_date_week_number 119390 non-null int64 6 arrival_date_day_of_month 119390 non-null int64 7 stays_in_weekend_nights 119390 non-null int64 8 stays_in_week_nights 119390 non-null int64 9 adults 119390 non-null int64 10 children 119386 non-null float64 11 babies 119390 non-null int64 12 meal 119390 non-null object 13 country 118902 non-null object 14 market_segment 119390 non-null object 15 distribution_channel 119390 non-null object 16 is_repeated_guest 119390 non-null int64 17 previous_cancellations 119390 non-null int64 18 previous_bookings_not_canceled 119390 non-null int64 19 reserved_room_type 119390 non-null object 20 assigned_room_type 119390 non-null object 21 booking_changes 119390 non-null int64 22 deposit_type 119390 non-null object 23 agent 103050 non-null float64 24 company 6797 non-null float64 25 days_in_waiting_list 119390 non-null int64 26 customer_type 119390 non-null object 27 adr 119390 non-null float64 28 required_car_parking_spaces 119390 non-null int64 29 total_of_special_requests 119390 non-null int64 30 reservation_status 119390 non-null object 31 reservation_status_date 119390 non-null object dtypes: float64(4), int64(16), object(12) memory usage: 29.1+ MB
# It appears that a few columns contain some empty values since the Non-Null count for a few columns is lower than the total number of rows (119390). We’ll need to deal with empty values and manually adjust the data type for each column on a case-by-case basis.
#checking for null values
null_values = pd.DataFrame({'Null Values' : hotel_raw_df.isna().sum(), 'Percentage Null Values' : (hotel_raw_df.isna().sum()) / (hotel_raw_df.shape[0]) * (100)})
null_values
| Null Values | Percentage Null Values | |
|---|---|---|
| hotel | 0 | 0.000000 |
| is_canceled | 0 | 0.000000 |
| lead_time | 0 | 0.000000 |
| arrival_date_year | 0 | 0.000000 |
| arrival_date_month | 0 | 0.000000 |
| arrival_date_week_number | 0 | 0.000000 |
| arrival_date_day_of_month | 0 | 0.000000 |
| stays_in_weekend_nights | 0 | 0.000000 |
| stays_in_week_nights | 0 | 0.000000 |
| adults | 0 | 0.000000 |
| children | 4 | 0.003350 |
| babies | 0 | 0.000000 |
| meal | 0 | 0.000000 |
| country | 488 | 0.408744 |
| market_segment | 0 | 0.000000 |
| distribution_channel | 0 | 0.000000 |
| is_repeated_guest | 0 | 0.000000 |
| previous_cancellations | 0 | 0.000000 |
| previous_bookings_not_canceled | 0 | 0.000000 |
| reserved_room_type | 0 | 0.000000 |
| assigned_room_type | 0 | 0.000000 |
| booking_changes | 0 | 0.000000 |
| deposit_type | 0 | 0.000000 |
| agent | 16340 | 13.686238 |
| company | 112593 | 94.306893 |
| days_in_waiting_list | 0 | 0.000000 |
| customer_type | 0 | 0.000000 |
| adr | 0 | 0.000000 |
| required_car_parking_spaces | 0 | 0.000000 |
| total_of_special_requests | 0 | 0.000000 |
| reservation_status | 0 | 0.000000 |
| reservation_status_date | 0 | 0.000000 |
# check for missing values
hotel_raw_df.isnull().sum()
hotel 0 is_canceled 0 lead_time 0 arrival_date_year 0 arrival_date_month 0 arrival_date_week_number 0 arrival_date_day_of_month 0 stays_in_weekend_nights 0 stays_in_week_nights 0 adults 0 children 4 babies 0 meal 0 country 488 market_segment 0 distribution_channel 0 is_repeated_guest 0 previous_cancellations 0 previous_bookings_not_canceled 0 reserved_room_type 0 assigned_room_type 0 booking_changes 0 deposit_type 0 agent 16340 company 112593 days_in_waiting_list 0 customer_type 0 adr 0 required_car_parking_spaces 0 total_of_special_requests 0 reservation_status 0 reservation_status_date 0 dtype: int64
filter = (hotel_raw_df.children == 0) & (hotel_raw_df.adults == 0) & (hotel_raw_df.babies == 0)
hotel_raw_df[~filter]
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.00 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.00 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.00 | 0 | 1 | Check-Out | 2015-07-03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 119385 | City Hotel | 0 | 23 | 2017 | August | 35 | 30 | 2 | 5 | 2 | ... | No Deposit | 394.0 | NaN | 0 | Transient | 96.14 | 0 | 0 | Check-Out | 2017-09-06 |
| 119386 | City Hotel | 0 | 102 | 2017 | August | 35 | 31 | 2 | 5 | 3 | ... | No Deposit | 9.0 | NaN | 0 | Transient | 225.43 | 0 | 2 | Check-Out | 2017-09-07 |
| 119387 | City Hotel | 0 | 34 | 2017 | August | 35 | 31 | 2 | 5 | 2 | ... | No Deposit | 9.0 | NaN | 0 | Transient | 157.71 | 0 | 4 | Check-Out | 2017-09-07 |
| 119388 | City Hotel | 0 | 109 | 2017 | August | 35 | 31 | 2 | 5 | 2 | ... | No Deposit | 89.0 | NaN | 0 | Transient | 104.40 | 0 | 0 | Check-Out | 2017-09-07 |
| 119389 | City Hotel | 0 | 205 | 2017 | August | 35 | 29 | 2 | 7 | 2 | ... | No Deposit | 9.0 | NaN | 0 | Transient | 151.20 | 0 | 2 | Check-Out | 2017-09-07 |
119210 rows × 32 columns
hotel_raw_df.sample(10)
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 95587 | City Hotel | 0 | 82 | 2016 | August | 34 | 19 | 0 | 2 | 3 | ... | No Deposit | 9.0 | NaN | 0 | Transient | 150.30 | 0 | 3 | Check-Out | 2016-08-21 |
| 34077 | Resort Hotel | 0 | 115 | 2017 | March | 10 | 9 | 0 | 3 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 65.00 | 0 | 1 | Check-Out | 2017-03-12 |
| 3913 | Resort Hotel | 1 | 72 | 2016 | February | 6 | 6 | 2 | 1 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient-Party | 73.33 | 0 | 0 | Canceled | 2016-01-19 |
| 23458 | Resort Hotel | 0 | 15 | 2016 | April | 17 | 23 | 2 | 1 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 93.67 | 0 | 1 | Check-Out | 2016-04-26 |
| 24945 | Resort Hotel | 0 | 8 | 2016 | June | 24 | 8 | 0 | 2 | 2 | ... | No Deposit | NaN | 154.0 | 0 | Transient | 66.00 | 0 | 0 | Check-Out | 2016-06-10 |
| 4750 | Resort Hotel | 0 | 169 | 2016 | March | 14 | 31 | 0 | 3 | 2 | ... | Refundable | NaN | 223.0 | 0 | Transient-Party | 66.00 | 0 | 0 | Check-Out | 2016-04-03 |
| 24483 | Resort Hotel | 0 | 165 | 2016 | May | 22 | 22 | 1 | 0 | 2 | ... | No Deposit | 251.0 | NaN | 0 | Transient | 59.00 | 0 | 1 | Check-Out | 2016-05-23 |
| 43259 | City Hotel | 0 | 56 | 2015 | September | 38 | 17 | 0 | 2 | 2 | ... | No Deposit | 20.0 | NaN | 0 | Transient-Party | 82.00 | 0 | 0 | Check-Out | 2015-09-19 |
| 33976 | Resort Hotel | 0 | 191 | 2017 | February | 8 | 23 | 4 | 10 | 2 | ... | No Deposit | 96.0 | NaN | 0 | Transient | 47.00 | 0 | 1 | Check-Out | 2017-03-09 |
| 3214 | Resort Hotel | 0 | 78 | 2015 | November | 47 | 21 | 0 | 1 | 2 | ... | No Deposit | NaN | 342.0 | 0 | Transient-Party | 32.00 | 0 | 0 | Check-Out | 2015-11-22 |
10 rows × 32 columns
# Exploratory Analysis and Visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (9, 5)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
hotel_raw_df.country
0 PRT
1 PRT
2 GBR
3 GBR
4 GBR
...
119385 BEL
119386 FRA
119387 DEU
119388 GBR
119389 DEU
Name: country, Length: 119390, dtype: object
# the number of countries from which the total bookings are made
hotel_raw_df.country.nunique()
177
top_countries_with_codes = hotel_raw_df.country.value_counts().head(10)
top_countries_with_codes
PRT 48590 GBR 12129 FRA 10415 ESP 8568 DEU 7287 ITA 3766 IRL 3375 BEL 2342 BRA 2224 NLD 2104 Name: country, dtype: int64
plt.figure(figsize=(12,6))
plt.xticks(rotation=75)
plt.title('Top 10 countries from where these hotels are recieving guests')
sns.barplot(x=top_countries_with_codes.index, y=top_countries_with_codes);
url= 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv'
country_code = pd.read_csv(url)
country_code
| name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AF | AFG | 4 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 1 | Åland Islands | AX | ALA | 248 | ISO 3166-2:AX | Europe | Northern Europe | NaN | 150.0 | 154.0 | NaN |
| 2 | Albania | AL | ALB | 8 | ISO 3166-2:AL | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| 3 | Algeria | DZ | DZA | 12 | ISO 3166-2:DZ | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN |
| 4 | American Samoa | AS | ASM | 16 | ISO 3166-2:AS | Oceania | Polynesia | NaN | 9.0 | 61.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 244 | Wallis and Futuna | WF | WLF | 876 | ISO 3166-2:WF | Oceania | Polynesia | NaN | 9.0 | 61.0 | NaN |
| 245 | Western Sahara | EH | ESH | 732 | ISO 3166-2:EH | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN |
| 246 | Yemen | YE | YEM | 887 | ISO 3166-2:YE | Asia | Western Asia | NaN | 142.0 | 145.0 | NaN |
| 247 | Zambia | ZM | ZMB | 894 | ISO 3166-2:ZM | Africa | Sub-Saharan Africa | Eastern Africa | 2.0 | 202.0 | 14.0 |
| 248 | Zimbabwe | ZW | ZWE | 716 | ISO 3166-2:ZW | Africa | Sub-Saharan Africa | Eastern Africa | 2.0 | 202.0 | 14.0 |
249 rows × 11 columns
country_code['country'] = country_code['alpha-3'].str.replace('"','')
country_code
| name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AF | AFG | 4 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN | AFG |
| 1 | Åland Islands | AX | ALA | 248 | ISO 3166-2:AX | Europe | Northern Europe | NaN | 150.0 | 154.0 | NaN | ALA |
| 2 | Albania | AL | ALB | 8 | ISO 3166-2:AL | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN | ALB |
| 3 | Algeria | DZ | DZA | 12 | ISO 3166-2:DZ | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN | DZA |
| 4 | American Samoa | AS | ASM | 16 | ISO 3166-2:AS | Oceania | Polynesia | NaN | 9.0 | 61.0 | NaN | ASM |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 244 | Wallis and Futuna | WF | WLF | 876 | ISO 3166-2:WF | Oceania | Polynesia | NaN | 9.0 | 61.0 | NaN | WLF |
| 245 | Western Sahara | EH | ESH | 732 | ISO 3166-2:EH | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN | ESH |
| 246 | Yemen | YE | YEM | 887 | ISO 3166-2:YE | Asia | Western Asia | NaN | 142.0 | 145.0 | NaN | YEM |
| 247 | Zambia | ZM | ZMB | 894 | ISO 3166-2:ZM | Africa | Sub-Saharan Africa | Eastern Africa | 2.0 | 202.0 | 14.0 | ZMB |
| 248 | Zimbabwe | ZW | ZWE | 716 | ISO 3166-2:ZW | Africa | Sub-Saharan Africa | Eastern Africa | 2.0 | 202.0 | 14.0 | ZWE |
249 rows × 12 columns
country_code['country'] = country_code['country'].str.replace(' ','')
country_code['country']
0 AFG
1 ALA
2 ALB
3 DZA
4 ASM
...
244 WLF
245 ESH
246 YEM
247 ZMB
248 ZWE
Name: country, Length: 249, dtype: object
hotel_df = pd.merge(hotel_raw_df, country_code)
hotel_df
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | PT | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | PT | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| 2 | Resort Hotel | 0 | 0 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | PT | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| 3 | Resort Hotel | 0 | 9 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | PT | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| 4 | Resort Hotel | 1 | 85 | 2015 | July | 27 | 1 | 0 | 3 | 2 | ... | PT | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 117615 | City Hotel | 0 | 6 | 2017 | June | 24 | 11 | 2 | 1 | 2 | ... | SD | SDN | 729 | ISO 3166-2:SD | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN |
| 117616 | City Hotel | 0 | 116 | 2017 | July | 27 | 2 | 2 | 0 | 2 | ... | TF | ATF | 260 | ISO 3166-2:TF | Africa | Sub-Saharan Africa | Eastern Africa | 2.0 | 202.0 | 14.0 |
| 117617 | City Hotel | 0 | 84 | 2017 | July | 30 | 27 | 2 | 5 | 2 | ... | SL | SLE | 694 | ISO 3166-2:SL | Africa | Sub-Saharan Africa | Western Africa | 2.0 | 202.0 | 11.0 |
| 117618 | City Hotel | 0 | 2 | 2017 | August | 32 | 9 | 0 | 3 | 2 | ... | LA | LAO | 418 | ISO 3166-2:LA | Asia | South-eastern Asia | NaN | 142.0 | 35.0 | NaN |
| 117619 | City Hotel | 0 | 3 | 2017 | August | 32 | 10 | 0 | 2 | 2 | ... | LA | LAO | 418 | ISO 3166-2:LA | Asia | South-eastern Asia | NaN | 142.0 | 35.0 | NaN |
117620 rows × 43 columns
top_countries = hotel_df.name.value_counts().head(10)
top_countries
Portugal 48590 United Kingdom of Great Britain and Northern Ireland 12129 France 10415 Spain 8568 Germany 7287 Italy 3766 Ireland 3375 Belgium 2342 Brazil 2224 Netherlands 2104 Name: name, dtype: int64
plt.figure(figsize=(12,6))
plt.xticks(rotation=75)
plt.title('Top 10 countries from where these hotels are receiving guests')
sns.barplot(x=top_countries.index, y=top_countries);
It appears that a disproportionately high number of bookings are from Portugal, probably because the hotel is located in Portugal itself. The second country is the United Kingdom which is approx. 75% behind.
#Country wise guest (Let’s look at the country-wise guests that both the hotels in total have received.)
country_wise_guests = hotel_df[hotel_df['is_canceled'] == 0]['name'].value_counts().reset_index()
country_wise_guests.columns = ['name', 'No of guests']
country_wise_guests
| name | No of guests | |
|---|---|---|
| 0 | Portugal | 21071 |
| 1 | United Kingdom of Great Britain and Northern I... | 9676 |
| 2 | France | 8481 |
| 3 | Spain | 6391 |
| 4 | Germany | 6069 |
| ... | ... | ... |
| 158 | Bahamas | 1 |
| 159 | Tajikistan | 1 |
| 160 | Macao | 1 |
| 161 | Djibouti | 1 |
| 162 | Bahrain | 1 |
163 rows × 2 columns
top_countries_guests = country_wise_guests.head(10)
top_countries_guests
| name | No of guests | |
|---|---|---|
| 0 | Portugal | 21071 |
| 1 | United Kingdom of Great Britain and Northern I... | 9676 |
| 2 | France | 8481 |
| 3 | Spain | 6391 |
| 4 | Germany | 6069 |
| 5 | Ireland | 2543 |
| 6 | Italy | 2433 |
| 7 | Belgium | 1868 |
| 8 | Netherlands | 1717 |
| 9 | United States of America | 1596 |
plt.figure(figsize=(12,6))
plt.xticks(rotation=75)
plt.title('Top 10 countries from where these hotels are receiving guests')
sns.barplot(x=top_countries_guests.name, y=top_countries_guests['No of guests']);
resort_hotel = hotel_df.loc[(hotel_df["hotel"] == "Resort Hotel") & (hotel_df["is_canceled"] == 0)]
city_hotel = hotel_df.loc[(hotel_df["hotel"] == "City Hotel") & (hotel_df["is_canceled"] == 0)]
# Counting adults and children as paying guests only, not babies.
resort_hotel["adr_pp"] = resort_hotel["adr"] / (resort_hotel["adults"] + resort_hotel["children"])
/var/folders/qm/5x1g9wd51qv_hrcv4kk_g3f80000gn/T/ipykernel_1836/1572811521.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy resort_hotel["adr_pp"] = resort_hotel["adr"] / (resort_hotel["adults"] + resort_hotel["children"])
import numpy as np
resort_hotel_filtered = resort_hotel.replace([np.inf, -np.inf], 0)
city_hotel["adr_pp"] = city_hotel["adr"] / (city_hotel["adults"] + city_hotel["children"])
city_hotel_filtered = city_hotel.replace([np.inf, -np.inf], 0)
/var/folders/qm/5x1g9wd51qv_hrcv4kk_g3f80000gn/T/ipykernel_1836/2552075327.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy city_hotel["adr_pp"] = city_hotel["adr"] / (city_hotel["adults"] + city_hotel["children"])
print("""From all non-cnceled bookings, across all room types and meals, the average prices are:
Resort hotel: {:.2f} € per night and person.
City hotel: {:.2f} € per night and person."""
.format(resort_hotel_filtered["adr_pp"].mean(), city_hotel_filtered["adr_pp"].mean()))
From all non-cnceled bookings, across all room types and meals, the average prices are: Resort hotel: 47.50 € per night and person. City hotel: 59.26 € per night and person.
# normalize price per night (adr):
hotel_df["adr_pp"] = hotel_df["adr"] / (hotel_df["adults"] + hotel_df["children"])
hotel_df_guests = hotel_df.loc[hotel_df["is_canceled"] == 0]
hotel_df_guests
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | adr_pp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN | 0.000 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN | 0.000 |
| 2 | Resort Hotel | 0 | 0 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN | 53.500 |
| 3 | Resort Hotel | 0 | 9 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN | 51.500 |
| 7 | Resort Hotel | 0 | 35 | 2015 | July | 27 | 1 | 0 | 4 | 2 | ... | PRT | 620 | ISO 3166-2:PT | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN | 72.500 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 117615 | City Hotel | 0 | 6 | 2017 | June | 24 | 11 | 2 | 1 | 2 | ... | SDN | 729 | ISO 3166-2:SD | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN | 49.500 |
| 117616 | City Hotel | 0 | 116 | 2017 | July | 27 | 2 | 2 | 0 | 2 | ... | ATF | 260 | ISO 3166-2:TF | Africa | Sub-Saharan Africa | Eastern Africa | 2.0 | 202.0 | 14.0 | 63.000 |
| 117617 | City Hotel | 0 | 84 | 2017 | July | 30 | 27 | 2 | 5 | 2 | ... | SLE | 694 | ISO 3166-2:SL | Africa | Sub-Saharan Africa | Western Africa | 2.0 | 202.0 | 11.0 | 60.715 |
| 117618 | City Hotel | 0 | 2 | 2017 | August | 32 | 9 | 0 | 3 | 2 | ... | LAO | 418 | ISO 3166-2:LA | Asia | South-eastern Asia | NaN | 142.0 | 35.0 | NaN | 91.665 |
| 117619 | City Hotel | 0 | 3 | 2017 | August | 32 | 10 | 0 | 2 | 2 | ... | LAO | 418 | ISO 3166-2:LA | Asia | South-eastern Asia | NaN | 142.0 | 35.0 | NaN | 90.000 |
73718 rows × 44 columns
# only actual gusts
room_prices = hotel_df_guests[["hotel", "reserved_room_type", "adr_pp"]].sort_values("reserved_room_type")
room_prices
| hotel | reserved_room_type | adr_pp | |
|---|---|---|---|
| 69507 | City Hotel | A | 152.000000 |
| 65981 | Resort Hotel | A | 14.500000 |
| 65982 | Resort Hotel | A | 14.500000 |
| 96679 | City Hotel | A | 49.500000 |
| 65984 | Resort Hotel | A | 27.000000 |
| ... | ... | ... | ... |
| 13296 | Resort Hotel | H | 64.666667 |
| 183 | Resort Hotel | L | 37.500000 |
| 8934 | Resort Hotel | L | 86.000000 |
| 8163 | Resort Hotel | L | 82.000000 |
| 582 | Resort Hotel | L | 100.000000 |
73718 rows × 3 columns
# boxplot:
plt.figure(figsize=(14, 10))
sns.boxplot(x="reserved_room_type",
y="adr_pp",
hue="hotel",
data=room_prices,
hue_order=["City Hotel", "Resort Hotel"],
fliersize=0)
plt.title("Price of room types per night and person", fontsize=16)
plt.xlabel("Room type", fontsize=16)
plt.ylabel("Price [EUR]", fontsize=16)
plt.legend(loc="upper right")
plt.ylim(0, 160)
plt.show()
# grab data:
room_prices_monthly = hotel_df_guests[["hotel", "arrival_date_month", "adr_pp"]].sort_values("arrival_date_month")
room_prices_monthly
| hotel | arrival_date_month | adr_pp | |
|---|---|---|---|
| 90695 | City Hotel | April | 45.475 |
| 96661 | City Hotel | April | 49.500 |
| 96660 | City Hotel | April | 49.500 |
| 96659 | City Hotel | April | 81.000 |
| 96658 | City Hotel | April | 67.500 |
| ... | ... | ... | ... |
| 95721 | City Hotel | September | 63.000 |
| 95722 | City Hotel | September | 77.500 |
| 95723 | City Hotel | September | 63.000 |
| 19345 | City Hotel | September | 109.250 |
| 75196 | Resort Hotel | September | 64.000 |
73718 rows × 3 columns
# order by month:
ordered_months = ["January", "February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"]
room_prices_monthly["arrival_date_month"] = pd.Categorical(room_prices_monthly["arrival_date_month"], categories=ordered_months, ordered=True)
# barplot with standard deviation:
plt.figure(figsize=(12, 8))
sns.lineplot(x = "arrival_date_month", y="adr_pp", hue="hotel", data=room_prices_monthly,
hue_order = ["City Hotel", "Resort Hotel"], ci="sd", size="hotel", sizes=(2, 2))
plt.title("Room price per night and person over the year", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=90)
plt.ylabel("Price [EUR]", fontsize=16)
plt.show()
#Q1: How many guests are there who have repeatedly canceled the bookings?
hotel_df['is_repeated_guest'] = hotel_df['is_repeated_guest'].replace(0,'No')
hotel_df['is_repeated_guest'] = hotel_df['is_repeated_guest'].replace(1,'Yes')
hotel_df['is_repeated_guest']
0 No
1 No
2 No
3 No
4 No
..
117615 No
117616 No
117617 No
117618 No
117619 No
Name: is_repeated_guest, Length: 117620, dtype: object
sns.set(style = "whitegrid")
plt.title("Canceled or not?", fontdict = {'fontsize': 30})
canceled = sns.countplot(x = hotel_df.hotel, hue = 'is_repeated_guest', data = hotel_df)
#Q2: Which hotel people like to stay and spend more time? (We might see a skew distribution here, it is best that we check this on weekdays and weekends separately)
plt.figure(figsize = (15,10))
sns.boxplot(x = "market_segment", y = "stays_in_week_nights", data = hotel_df, hue = "hotel", palette = 'Set1');
plt.figure(figsize=(15,10))
sns.boxplot(x = "market_segment", y = "stays_in_weekend_nights", data = hotel_df, hue = "hotel", palette = 'Set1');
#Q3: What are the effects of deposit on cancellations by segments.
plt.figure(figsize = (15,10))
sns.set(style = "whitegrid")
plt.title("Countplot Distrubiton of Segment by Deposit Type", fontdict = {'fontsize':20})
ax = sns.countplot(x = "market_segment", hue = 'deposit_type', data = hotel_df)
plt.figure(figsize = (15,10))
sns.set(style = "darkgrid")
plt.title("Countplot Distributon of Segments by Cancellation", fontdict = {'fontsize':20})
ax = sns.countplot(x = "market_segment", hue = 'is_canceled', data = hotel_df)
#Q4: What is the relationship of lead time with cancellation.
hotel_df['is_canceled'] = hotel_df['is_canceled'].replace(0,'No')
hotel_df['is_canceled'] = hotel_df['is_canceled'].replace(1,'Yes')
(sns.FacetGrid(hotel_df, hue = 'is_canceled',
height = 6,
xlim = (0,500))
.map(sns.kdeplot, 'lead_time', shade = True)
.add_legend());
#Q5: What is the monthly customers each hotel is receiving?
plt.figure(figsize =(15,10))
sns.set(style="whitegrid")
plt.title("Total Customers - Monthly ", fontdict={'fontsize': 20})
ax = sns.countplot(x = "arrival_date_month", hue = 'hotel', data = hotel_df)
#Q6: Which are the most busy month?
# Create a DateFrame with the relevant data:
resort_guests_monthly = resort_hotel.groupby("arrival_date_month")["hotel"].count()
city_guests_monthly = city_hotel.groupby("arrival_date_month")["hotel"].count()
resort_guest_data = pd.DataFrame({"month": list(resort_guests_monthly.index),
"hotel": "Resort hotel",
"guests": list(resort_guests_monthly.values)})
city_guest_data = pd.DataFrame({"month": list(city_guests_monthly.index),
"hotel": "City hotel",
"guests": list(city_guests_monthly.values)})
full_guest_data = pd.concat([resort_guest_data,city_guest_data], ignore_index=True)
# order by month:
ordered_months = ["January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December"]
full_guest_data["month"] = pd.Categorical(full_guest_data["month"], categories=ordered_months, ordered=True)
# Dataset contains July and August date from 3 years, the other month from 2 years. Normalize data:
full_guest_data.loc[(full_guest_data["month"] == "July") | (full_guest_data["month"] == "August"),
"guests"] /= 3
full_guest_data.loc[~((full_guest_data["month"] == "July") | (full_guest_data["month"] == "August")),
"guests"] /= 2
#show figure:
plt.figure(figsize=(13, 8))
sns.lineplot(x = "month", y="guests", hue="hotel", data=full_guest_data,
hue_order = ["City hotel", "Resort hotel"], size="hotel", sizes=(2.5, 2.5))
plt.title("Average number of hotel guests per month", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=90)
plt.ylabel("Number of guests", fontsize=16)
plt.show()
# absolute cancelations:
total_cancelations = hotel_raw_df["is_canceled"].sum()
resort_cancelations = hotel_raw_df.loc[hotel_raw_df["hotel"] == "Resort Hotel"]["is_canceled"].sum()
city_cancelations = hotel_raw_df.loc[hotel_raw_df["hotel"] == "City Hotel"]["is_canceled"].sum()
# as percent:
rel_cancel = total_cancelations / hotel_df.shape[0] * 100
rh_rel_cancel = resort_cancelations / hotel_raw_df.loc[hotel_raw_df["hotel"] == "Resort Hotel"].shape[0] * 100
ch_rel_cancel = city_cancelations / hotel_raw_df.loc[hotel_raw_df["hotel"] == "City Hotel"].shape[0] * 100
print(f"Total bookings canceled: {total_cancelations:,} ({rel_cancel:.0f} %)")
print(f"Resort hotel bookings canceled: {resort_cancelations:,} ({rh_rel_cancel:.0f} %)")
print(f"City hotel bookings canceled: {city_cancelations:,} ({ch_rel_cancel:.0f} %)")
Total bookings canceled: 44,224 (38 %) Resort hotel bookings canceled: 11,122 (28 %) City hotel bookings canceled: 33,102 (42 %)